You are now the data slave to the principal investigator Dr. Vinca Monster. Dr. M is in the Grape Program at State U, and you are just a poor graduate student trying to get your degree. Dr. M is interested in wine preferences and the influences of physico-chemical properties on preferences. Her laboratory has gathered an extensive dataset on Portuguese white varietals.

You will find the white_wines.csv file and its description in my GitHub repo: [https://github.com/vhertzb/Regression-1](https://github.com/vhertzb/Regression-1).

Please use the techniques you have learned in the last two classes, specifically exploratory data analysis and linear regression, to determine the association of the wine properties with preference.

Prepare a report for presentation at the next Monster lab meeting about this dataset.

Rubric:

Exploration (summary statistics (the m’s), univariate graphs, multivariate graphs) Regression (Models explored, diagnostics completed, final model choice, justification)

Please include a concluding paragraph (or two) about the implications of your findings.

#load up necessary packages
library(HistData)
library(car)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
# Read in the dataset with readr's read_csv().
# NOTE(review): this is an absolute path on the author's machine; to re-run the
# report elsewhere, point it at a local copy of the CSV from the course repo.
library(readr)
White_wines <- read_csv("~/Documents/Big Data Class/N741 Data Wrangling/InClass2.1/Regression-1/White_wines.csv")
## Parsed with column specification:
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_integer()
## )
# The CSV headers contain spaces (e.g. `fixed acidity`), which are not
# syntactically valid R names. make.names() rewrites them (spaces become dots,
# e.g. fixed.acidity) so the columns can be used unquoted in formulas and with $.
names(White_wines) <- make.names(names(White_wines))
# Summary statistics (min, quartiles, median, mean, max) for every column
summary(White_wines)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.878  
##  3rd Qu.:6.000  
##  Max.   :9.000
library(Rcmdr)
## Loading required package: splines
## Loading required package: RcmdrMisc
## Loading required package: sandwich
## The Commander GUI is launched only in interactive sessions
# Scatterplot matrices of quality against the predictors, split into three
# groups so each matrix stays readable. Smoothers are drawn (span 0.5) without
# regression lines, spread bands or ellipses; histograms go on the diagonal.
scatterplotMatrix(
  ~ quality + residual.sugar + sulphates + total.sulfur.dioxide +
    volatile.acidity,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

scatterplotMatrix(
  ~ quality + pH + density + free.sulfur.dioxide + chlorides,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

scatterplotMatrix(
  ~ quality + citric.acid + fixed.acidity + alcohol,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

Free sulfur dioxide, fixed acidity, and density appear to have skewed (non-normal) distributions in the histograms above, so I log-transformed them (base 2) below.

# Add base-2 log versions of free sulfur dioxide, fixed acidity and density
# as new columns; the original columns are left untouched.
White_wines$free.sulf.diox.log <- log2(White_wines$free.sulfur.dioxide)
White_wines$fixed.acidity.log <- log2(White_wines$fixed.acidity)
White_wines$density.log <- log2(White_wines$density)
# Re-draw the three scatterplot matrices, swapping in the log2-transformed
# columns (density.log, free.sulf.diox.log, fixed.acidity.log) where they exist.
scatterplotMatrix(
  ~ quality + residual.sugar + sulphates + total.sulfur.dioxide +
    volatile.acidity,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

scatterplotMatrix(
  ~ quality + pH + density.log + free.sulf.diox.log + chlorides,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

scatterplotMatrix(
  ~ quality + citric.acid + fixed.acidity.log + alcohol,
  data = White_wines,
  diagonal = "histogram",
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

Those transformations help a little bit.

# Correlation matrix of quality and every candidate predictor, using the log2
# versions of density, fixed acidity and free sulfur dioxide.
cor(
  White_wines[, c(
    "quality", "alcohol", "chlorides", "citric.acid", "density.log",
    "fixed.acidity.log", "free.sulf.diox.log", "pH", "residual.sugar",
    "sulphates", "total.sulfur.dioxide", "volatile.acidity"
  )],
  use = "complete"
)
##                           quality     alcohol   chlorides  citric.acid
## quality               1.000000000  0.43557472 -0.20993441 -0.009209091
## alcohol               0.435574715  1.00000000 -0.36018871 -0.075728730
## chlorides            -0.209934411 -0.36018871  1.00000000  0.114364448
## citric.acid          -0.009209091 -0.07572873  0.11436445  1.000000000
## density.log          -0.307723788 -0.78135429  0.25757406  0.149442828
## fixed.acidity.log    -0.109736681 -0.13108514  0.03305563  0.292566248
## free.sulf.diox.log    0.099058582 -0.22409197  0.09168564  0.084276395
## pH                    0.099427246  0.12143210 -0.09043946 -0.163748211
## residual.sugar       -0.097576829 -0.45063122  0.08868454  0.094211624
## sulphates             0.053677877 -0.01743277  0.01676288  0.062330940
## total.sulfur.dioxide -0.174737218 -0.44889210  0.19891030  0.121130798
## volatile.acidity     -0.194722969  0.06771794  0.07051157 -0.149471811
##                      density.log fixed.acidity.log free.sulf.diox.log
## quality              -0.30772379       -0.10973668         0.09905858
## alcohol              -0.78135429       -0.13108514        -0.22409197
## chlorides             0.25757406        0.03305563         0.09168564
## citric.acid           0.14944283        0.29256625         0.08427640
## density.log           1.00000000        0.27695036         0.28317156
## fixed.acidity.log     0.27695036        1.00000000        -0.04534913
## free.sulf.diox.log    0.28317156       -0.04534913         1.00000000
## pH                   -0.09368819       -0.43478921         0.02199554
## residual.sugar        0.83864966        0.10237716         0.30293472
## sulphates             0.07444942       -0.01415546         0.06084248
## total.sulfur.dioxide  0.53044357        0.10259928         0.59619976
## volatile.acidity      0.02661505       -0.02974209        -0.11663198
##                                pH residual.sugar   sulphates
## quality               0.099427246    -0.09757683  0.05367788
## alcohol               0.121432099    -0.45063122 -0.01743277
## chlorides            -0.090439456     0.08868454  0.01676288
## citric.acid          -0.163748211     0.09421162  0.06233094
## density.log          -0.093688189     0.83864966  0.07444942
## fixed.acidity.log    -0.434789207     0.10237716 -0.01415546
## free.sulf.diox.log    0.021995543     0.30293472  0.06084248
## pH                    1.000000000    -0.19413345  0.15595150
## residual.sugar       -0.194133454     1.00000000 -0.02666437
## sulphates             0.155951497    -0.02666437  1.00000000
## total.sulfur.dioxide  0.002320972     0.40143931  0.13456237
## volatile.acidity     -0.031915368     0.06428606 -0.03572815
##                      total.sulfur.dioxide volatile.acidity
## quality                      -0.174737218      -0.19472297
## alcohol                      -0.448892102       0.06771794
## chlorides                     0.198910300       0.07051157
## citric.acid                   0.121130798      -0.14947181
## density.log                   0.530443572       0.02661505
## fixed.acidity.log             0.102599278      -0.02974209
## free.sulf.diox.log            0.596199757      -0.11663198
## pH                            0.002320972      -0.03191537
## residual.sugar                0.401439311       0.06428606
## sulphates                     0.134562367      -0.03572815
## total.sulfur.dioxide          1.000000000       0.08926050
## volatile.acidity              0.089260504       1.00000000

The variable most correlated with quality is alcohol (r=.44), so I will use that as my primary independent variable. Log density is highly correlated with alcohol (r=-.78) and residual sugar (r=.84), and log free sulfur dioxide is moderately correlated with total sulfur dioxide (r=.60), so I will consider whether these variables all need to be in the model.

Regression on the full dataset, starting with all predictors.

# Model 1: quality regressed on all eleven predictors, using the log2 versions
# of density, fixed acidity and free sulfur dioxide.
RegModel.1 <- lm(
  quality ~ alcohol + chlorides + citric.acid + density.log +
    fixed.acidity.log + free.sulf.diox.log + pH + residual.sugar +
    sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.1)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + density.log + 
##     fixed.acidity.log + free.sulf.diox.log + pH + residual.sugar + 
##     sulphates + total.sulfur.dioxide + volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4172 -0.5008 -0.0287  0.4585  3.0836 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.249e+00  5.481e-01  -2.279 0.022726 *  
## alcohol               1.976e-01  2.411e-02   8.199 3.07e-16 ***
## chlorides            -4.037e-01  5.383e-01  -0.750 0.453354    
## citric.acid           3.611e-03  9.443e-02   0.038 0.969494    
## density.log          -9.368e+01  1.311e+01  -7.144 1.04e-12 ***
## fixed.acidity.log     3.700e-01  9.982e-02   3.707 0.000212 ***
## free.sulf.diox.log    2.163e-01  1.780e-02  12.155  < 2e-16 ***
## pH                    6.609e-01  1.052e-01   6.285 3.57e-10 ***
## residual.sugar        7.305e-02  7.478e-03   9.768  < 2e-16 ***
## sulphates             6.300e-01  9.898e-02   6.364 2.14e-10 ***
## total.sulfur.dioxide -1.901e-03  3.695e-04  -5.146 2.76e-07 ***
## volatile.acidity     -1.651e+00  1.130e-01 -14.615  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7412 on 4886 degrees of freedom
## Multiple R-squared:  0.3012, Adjusted R-squared:  0.2996 
## F-statistic: 191.4 on 11 and 4886 DF,  p-value: < 2.2e-16

Interpretation: Density standard error is very high.

# Model 1.2: the same eleven predictors as Model 1, but with density, fixed
# acidity and free sulfur dioxide on their original (untransformed) scales.
RegModel.1.2 <- lm(
  quality ~ alcohol + chlorides + citric.acid + density +
    fixed.acidity + free.sulfur.dioxide + pH + residual.sugar +
    sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.1.2)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + density + 
##     fixed.acidity + free.sulfur.dioxide + pH + residual.sugar + 
##     sulphates + total.sulfur.dioxide + volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8348 -0.4934 -0.0379  0.4637  3.1143 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.502e+02  1.880e+01   7.987 1.71e-15 ***
## alcohol               1.935e-01  2.422e-02   7.988 1.70e-15 ***
## chlorides            -2.473e-01  5.465e-01  -0.452  0.65097    
## citric.acid           2.209e-02  9.577e-02   0.231  0.81759    
## density              -1.503e+02  1.907e+01  -7.879 4.04e-15 ***
## fixed.acidity         6.552e-02  2.087e-02   3.139  0.00171 ** 
## free.sulfur.dioxide   3.733e-03  8.441e-04   4.422 9.99e-06 ***
## pH                    6.863e-01  1.054e-01   6.513 8.10e-11 ***
## residual.sugar        8.148e-02  7.527e-03  10.825  < 2e-16 ***
## sulphates             6.315e-01  1.004e-01   6.291 3.44e-10 ***
## total.sulfur.dioxide -2.857e-04  3.781e-04  -0.756  0.44979    
## volatile.acidity     -1.863e+00  1.138e-01 -16.373  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4886 DF,  p-value: < 2.2e-16

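The logged variables seem to matter: the model with the log-transformed predictors fits slightly better (R-squared 0.301 vs. 0.282), so I keep the transformations.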

# Model 2: Model 1 with density.log removed.
RegModel.2 <- lm(
  quality ~ alcohol + chlorides + citric.acid + fixed.acidity.log +
    free.sulf.diox.log + pH + residual.sugar + sulphates +
    total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.2)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + fixed.acidity.log + 
##     free.sulf.diox.log + pH + residual.sugar + sulphates + total.sulfur.dioxide + 
##     volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3168 -0.5017 -0.0267  0.4521  3.1138 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.3159630  0.4162258   3.162  0.00158 ** 
## alcohol               0.3505671  0.0111413  31.465  < 2e-16 ***
## chlorides            -1.0109423  0.5342996  -1.892  0.05854 .  
## citric.acid          -0.0418197  0.0946938  -0.442  0.65878    
## fixed.acidity.log    -0.1341510  0.0709588  -1.891  0.05874 .  
## free.sulf.diox.log    0.2386673  0.0176120  13.551  < 2e-16 ***
## pH                    0.1830373  0.0815589   2.244  0.02486 *  
## residual.sugar        0.0227248  0.0025233   9.006  < 2e-16 ***
## sulphates             0.4414986  0.0958906   4.604 4.25e-06 ***
## total.sulfur.dioxide -0.0024369  0.0003636  -6.702 2.29e-11 ***
## volatile.acidity     -1.7368228  0.1129034 -15.383  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.745 on 4887 degrees of freedom
## Multiple R-squared:  0.2939, Adjusted R-squared:  0.2924 
## F-statistic: 203.4 on 10 and 4887 DF,  p-value: < 2.2e-16

Interpretation: holding the other predictors constant, each one-unit increase in alcohol is associated with a 0.35-point increase in quality. With density dropped, there are no anomalous standard errors. R-squared decreases slightly (from 0.301 to 0.294), but not enough to be practically significant.

# Model 3: Model 2 with citric.acid removed as well (so neither density.log
# nor citric.acid is in the model).
RegModel.3 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log + free.sulf.diox.log +
    pH + residual.sugar + sulphates + total.sulfur.dioxide +
    volatile.acidity,
  data = White_wines
)
summary(RegModel.3)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log + 
##     free.sulf.diox.log + pH + residual.sugar + sulphates + total.sulfur.dioxide + 
##     volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3099 -0.5021 -0.0274  0.4508  3.1118 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.3230570  0.4158815   3.181  0.00148 ** 
## alcohol               0.3502022  0.0111097  31.522  < 2e-16 ***
## chlorides            -1.0387422  0.5305348  -1.958  0.05030 .  
## fixed.acidity.log    -0.1416017  0.0689184  -2.055  0.03997 *  
## free.sulf.diox.log    0.2385761  0.0176094  13.548  < 2e-16 ***
## pH                    0.1848631  0.0814473   2.270  0.02327 *  
## residual.sugar        0.0226647  0.0025194   8.996  < 2e-16 ***
## sulphates             0.4389541  0.0957094   4.586 4.62e-06 ***
## total.sulfur.dioxide -0.0024471  0.0003629  -6.744 1.72e-11 ***
## volatile.acidity     -1.7284995  0.1113101 -15.529  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7449 on 4888 degrees of freedom
## Multiple R-squared:  0.2938, Adjusted R-squared:  0.2925 
## F-statistic:   226 on 9 and 4888 DF,  p-value: < 2.2e-16

Let’s compare the full model with the models fitted without density and then without citric acid as well.

# Compare the three regression models side by side:
# (1) all predictors, (2) without density.log, (3) without density.log or
# citric.acid. The title previously said "2" although three models are passed.
stargazer(RegModel.1, RegModel.2, RegModel.3,
  title = "Comparison of 3 Regression outputs", type = "html", align = TRUE)
Comparison of 3 Regression outputs
Dependent variable:
quality
(1) (2) (3)
alcohol 0.198*** 0.351*** 0.350***
(0.024) (0.011) (0.011)
chlorides -0.404 -1.011* -1.039*
(0.538) (0.534) (0.531)
citric.acid 0.004 -0.042
(0.094) (0.095)
density.log -93.679***
(13.113)
fixed.acidity.log 0.370*** -0.134* -0.142**
(0.100) (0.071) (0.069)
free.sulf.diox.log 0.216*** 0.239*** 0.239***
(0.018) (0.018) (0.018)
pH 0.661*** 0.183** 0.185**
(0.105) (0.082) (0.081)
residual.sugar 0.073*** 0.023*** 0.023***
(0.007) (0.003) (0.003)
sulphates 0.630*** 0.441*** 0.439***
(0.099) (0.096) (0.096)
total.sulfur.dioxide -0.002*** -0.002*** -0.002***
(0.0004) (0.0004) (0.0004)
volatile.acidity -1.651*** -1.737*** -1.728***
(0.113) (0.113) (0.111)
Constant -1.249** 1.316*** 1.323***
(0.548) (0.416) (0.416)
Observations 4,898 4,898 4,898
R2 0.301 0.294 0.294
Adjusted R2 0.300 0.292 0.293
Residual Std. Error 0.741 (df = 4886) 0.745 (df = 4887) 0.745 (df = 4888)
F Statistic 191.408*** (df = 11; 4886) 203.363*** (df = 10; 4887) 225.974*** (df = 9; 4888)
Note: p<0.1; p<0.05; p<0.01
# diagnostics using residual plots: residuals against each predictor and
# against the fitted values. The printed table gives a curvature test for each
# predictor; the last row is Tukey's test (for the plot against fitted values).
residualPlots(RegModel.1)

##                      Test stat Pr(>|t|)
## alcohol                  5.270    0.000
## chlorides                1.403    0.161
## citric.acid             -4.424    0.000
## density.log              5.229    0.000
## fixed.acidity.log       -3.103    0.002
## free.sulf.diox.log     -11.193    0.000
## pH                       0.964    0.335
## residual.sugar           2.481    0.013
## sulphates                0.047    0.963
## total.sulfur.dioxide    -8.039    0.000
## volatile.acidity         3.210    0.001
## Tukey test               2.656    0.008
# Residual plots and curvature tests for RegModel.2 (no density.log)
residualPlots(RegModel.2)

##                      Test stat Pr(>|t|)
## alcohol                  5.440    0.000
## chlorides                2.085    0.037
## citric.acid             -4.385    0.000
## fixed.acidity.log       -3.854    0.000
## free.sulf.diox.log     -10.941    0.000
## pH                       0.287    0.774
## residual.sugar          -1.326    0.185
## sulphates                0.051    0.959
## total.sulfur.dioxide    -7.709    0.000
## volatile.acidity         1.905    0.057
## Tukey test               1.316    0.188
# Residual plots and curvature tests for RegModel.3 (no density.log or citric.acid)
residualPlots(RegModel.3)

##                      Test stat Pr(>|t|)
## alcohol                  5.393    0.000
## chlorides                2.077    0.038
## fixed.acidity.log       -3.864    0.000
## free.sulf.diox.log     -10.951    0.000
## pH                       0.279    0.780
## residual.sugar          -1.343    0.179
## sulphates                0.052    0.958
## total.sulfur.dioxide    -7.711    0.000
## volatile.acidity         1.911    0.056
## Tukey test               1.312    0.190
# Model 4: same predictors as Model 3, but with total sulfur dioxide replaced
# by its base-2 log (stored as a new column, tot.sulf.diox.log).
White_wines$tot.sulf.diox.log <- log2(White_wines$total.sulfur.dioxide)
RegModel.4 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log + free.sulf.diox.log +
    pH + residual.sugar + sulphates + tot.sulf.diox.log + volatile.acidity,
  data = White_wines
)
summary(RegModel.4)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log + 
##     free.sulf.diox.log + pH + residual.sugar + sulphates + tot.sulf.diox.log + 
##     volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4389 -0.5015 -0.0234  0.4476  3.1550 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.014293   0.434549   4.635 3.66e-06 ***
## alcohol             0.363387   0.011044  32.903  < 2e-16 ***
## chlorides          -1.103123   0.532734  -2.071  0.03844 *  
## fixed.acidity.log  -0.182052   0.069065  -2.636  0.00842 ** 
## free.sulf.diox.log  0.206141   0.018166  11.348  < 2e-16 ***
## pH                  0.152857   0.081812   1.868  0.06177 .  
## residual.sugar      0.021325   0.002524   8.447  < 2e-16 ***
## sulphates           0.392168   0.095962   4.087 4.45e-05 ***
## tot.sulf.diox.log  -0.103892   0.032194  -3.227  0.00126 ** 
## volatile.acidity   -1.829711   0.110961 -16.490  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7476 on 4888 degrees of freedom
## Multiple R-squared:  0.2888, Adjusted R-squared:  0.2875 
## F-statistic: 220.5 on 9 and 4888 DF,  p-value: < 2.2e-16
# Residual plots and curvature tests for RegModel.4 (log2 total sulfur dioxide)
residualPlots(RegModel.4)

##                    Test stat Pr(>|t|)
## alcohol                5.232    0.000
## chlorides              2.258    0.024
## fixed.acidity.log     -3.871    0.000
## free.sulf.diox.log   -11.958    0.000
## pH                     0.352    0.725
## residual.sugar        -1.086    0.277
## sulphates              0.383    0.702
## tot.sulf.diox.log     -8.952    0.000
## volatile.acidity       2.338    0.019
## Tukey test             1.060    0.289
# Model 5: Model 4 with both sulfur dioxide terms (free and total) removed.
RegModel.5 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log + pH + residual.sugar +
    sulphates + volatile.acidity,
  data = White_wines
)
summary(RegModel.5)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log + 
##     pH + residual.sugar + sulphates + volatile.acidity, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4166 -0.4956 -0.0341  0.4640  3.1425 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.498311   0.414340   6.030 1.76e-09 ***
## alcohol            0.364534   0.010735  33.957  < 2e-16 ***
## chlorides         -0.876093   0.539098  -1.625 0.104204    
## fixed.acidity.log -0.268134   0.069156  -3.877 0.000107 ***
## pH                 0.164578   0.082277   2.000 0.045524 *  
## residual.sugar     0.027958   0.002453  11.400  < 2e-16 ***
## sulphates          0.415638   0.096470   4.308 1.68e-05 ***
## volatile.acidity  -2.078582   0.109253 -19.025  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7587 on 4890 degrees of freedom
## Multiple R-squared:  0.2671, Adjusted R-squared:  0.2661 
## F-statistic: 254.6 on 7 and 4890 DF,  p-value: < 2.2e-16
# Residual plots and curvature tests for RegModel.5 (no sulfur dioxide terms)
residualPlots(RegModel.5)

##                   Test stat Pr(>|t|)
## alcohol               5.776    0.000
## chlorides             2.403    0.016
## fixed.acidity.log    -4.494    0.000
## pH                   -0.300    0.764
## residual.sugar       -2.426    0.015
## sulphates             0.482    0.630
## volatile.acidity      2.319    0.020
## Tukey test            0.087    0.930

Dropping both sulfur dioxide terms loses too much R-squared (0.289 in Model 4 vs. 0.267 in Model 5), so I will stick with Model 4.

# added-variable (partial regression) plots, one per predictor in RegModel.4
avPlots(RegModel.4, id.n=3, id.cex=0.7)

# id.n   - number of most extreme points to label in each plot
# id.cex - size of the point-label text (not of the plotted points)
# run the qq-plot of the residuals for RegModel.4, labelling 3 points
qqPlot(RegModel.4, id.n=3)

## 4746 3308  775 
##    1    2 4898
# here, id.n identifies the n observations with the largest residuals in absolute value

Are there any outliers?

# run Bonferroni test for outliers: reports the observations with the most
# extreme studentized residuals and their Bonferroni-adjusted p-values
outlierTest(RegModel.4)
##       rstudent unadjusted p-value Bonferonni p
## 4746 -4.622511         3.8889e-06     0.019048
## 3308 -4.498933         6.9880e-06     0.034227

Are there any points that are of high influence?

# identify highly influential points: index plots of influence diagnostics
# for RegModel.4, labelling the 3 most extreme observations in each panel
influenceIndexPlot(RegModel.4, id.n=3)

NB. If there are points that are a) outliers AND b) highly influential, these have potential to change the inference. You should consider removing them.

How do we make heads or tails out of the plots above? One way is with an influence plot.

# make influence plot for RegModel.4; the printed table lists the studentized
# residual, hat-value and Cook's distance of each labelled observation
influencePlot(RegModel.4, id.n=3)

##          StudRes         Hat        CookD
## 485  -0.46699974 0.044148299 1.007456e-03
## 741  -4.21888688 0.009063432 1.622380e-02
## 775   4.23185426 0.002029076 3.628626e-03
## 1218  0.82199081 0.032476245 2.268125e-03
## 2782  0.09801707 0.056049846 5.705804e-05
## 3308 -4.49893274 0.003901068 7.895759e-03
## 3902 -2.76136044 0.018625734 1.445229e-02
## 4746 -4.62251086 0.005604332 1.199263e-02

Another diagnostic is to test for heteroskedasticity (i.e., the variance of the error term is not constant).

# test for heteroskedasticity: score test of constant error variance against
# variance that changes with the fitted values
ncvTest(RegModel.4)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 12.49315    Df = 1     p = 0.0004084476

We also want to look for multicollinearity, that is, whether some of our independent variables are highly correlated with one another. We do this by looking at the Variance Inflation Factor (VIF). A GVIF > 4 suggests collinearity.

# Variance inflation factor for each predictor in RegModel.4
vif(RegModel.4)
##            alcohol          chlorides  fixed.acidity.log 
##           1.618513           1.186995           1.285326 
## free.sulf.diox.log                 pH     residual.sugar 
##           1.804186           1.337197           1.436486 
##          sulphates  tot.sulf.diox.log   volatile.acidity 
##           1.050918           2.135394           1.096020